# download_noisemap_issue.py
# NP (Noise Mapping) Downloader
# Downloads PDFs from a single Noise Mapping issue
# Skips "Editorial" section
# Creates folder based on Volume/Issue/Year
# Logs all results in CSV

import os
import re
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# ---------- Helpers ----------
def sanitize_filename(name):
    """Return *name* reduced to text that is safe to use as a file name.

    The characters Windows reserves in file names (backslash, slash, ``*``,
    ``?``, ``:``, ``"``, ``<``, ``>``, ``|``) are removed. Runs of whitespace,
    including newlines and tabs that scraped HTML titles may contain, collapse
    to a single space, and any remaining control characters are dropped.
    Leading whitespace and trailing dots/spaces are stripped.

    Returns ``"untitled"`` when nothing usable is left, so callers never end
    up with a bare extension such as ``.pdf``.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name)
    # Collapse whitespace first so newlines/tabs become spaces, not nothing.
    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = re.sub(r"[\x00-\x1f\x7f]", "", cleaned)
    # Windows rejects or silently drops trailing dots and spaces.
    cleaned = cleaned.strip().rstrip(". ")
    return cleaned or "untitled"

def extract_issue_metadata(title_text):
    """Build the download folder name from an issue page title.

    Example: 'Noise Mapping Volume 7, Issue 1 (2020)'
    gives 'NoiseMap_Vol7_Issue1_2020'.

    The volume and issue are the digits following the words "Volume" and
    "Issue" (case-insensitive). The year is the first standalone four-digit
    number starting with 19 or 20. Any part that cannot be found is replaced
    by an "Unknown" placeholder, so an empty or ``None`` title gives
    'NoiseMap_VolUnknown_IssueUnknown_UnknownYear'.
    """
    title_text = title_text or ""
    vol_match = re.search(r'Volume\s+(\d+)', title_text, re.I)
    issue_match = re.search(r'Issue\s+(\d+)', title_text, re.I)
    # Digit lookarounds stop the year from being cut out of a longer number
    # (e.g. an article id), which a bare \d{4} would do.
    year_match = re.search(r'(?<!\d)((?:19|20)\d{2})(?!\d)', title_text)
    vol = vol_match.group(1) if vol_match else "Unknown"
    iss = issue_match.group(1) if issue_match else "Unknown"
    year = year_match.group(1) if year_match else "UnknownYear"
    return f"NoiseMap_Vol{vol}_Issue{iss}_{year}"

# ---------- Input ----------
issue_url = input("Enter Noise Mapping issue URL: ").strip()
if not issue_url:
    raise SystemExit("[ERROR] No issue URL entered.")
headers = {"User-Agent": "Mozilla/5.0"}
# (connect, read) timeouts in seconds. requests has no default timeout, so
# without this a stalled server would hang the script forever.
request_timeout = (10, 60)

print(f"[INFO] Fetching: {issue_url}")
resp = requests.get(issue_url, headers=headers, timeout=request_timeout)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# ---------- Folder ----------
# The output folder is named after the Volume/Issue/Year in the page <title>.
page_title_tag = soup.find("title")
folder_name = extract_issue_metadata(page_title_tag.text if page_title_tag else "")
os.makedirs(folder_name, exist_ok=True)

log_path = os.path.join(folder_name, f"{folder_name}_log.csv")

count_downloaded = 0
count_skipped = 0
# Lower-cased file stems already used in this run, so two articles with the
# same title do not overwrite each other (also on case-insensitive
# file systems).
used_filenames = set()

# The context manager guarantees the log is flushed and closed even if an
# unexpected error aborts the loop.
with open(log_path, "w", newline="", encoding="utf-8") as log_file:
    csv_writer = csv.writer(log_file)
    csv_writer.writerow(["Title", "PDF URL", "Status"])

    # ---------- Parse issue content ----------
    # Headings and article entries are selected together in document order,
    # so each article belongs to the most recently seen section heading.
    current_section = None
    for el in soup.select("div.issueSubjectGroupHeading, li.ahead-issue"):
        # Section heading
        if "issueSubjectGroupHeading" in el.get("class", []):
            current_section = el.get_text(strip=True)
            continue

        title_tag = el.select_one("a.text-dark .ahead-of-print-title")

        # Skip Editorial section (logged even when the entry has no title)
        if current_section and "editorial" in current_section.lower():
            title = title_tag.get_text(strip=True) if title_tag is not None else "[No title]"
            print(f"[SKIP] Editorial: {title}")
            csv_writer.writerow([title, "", "Skipped (Editorial)"])
            count_skipped += 1
            continue

        # Entries without a title element are ignored silently
        if title_tag is None:
            continue
        title = title_tag.get_text(strip=True)

        # Extract PDF link; a download button without an href counts as
        # "no PDF" instead of raising KeyError.
        pdf_tag = el.select_one("a.btn-abstract-download-dgb")
        pdf_href = pdf_tag.get("href") if pdf_tag is not None else None
        if not pdf_href:
            print(f"[SKIP] No PDF found for: {title}")
            csv_writer.writerow([title, "", "No PDF"])
            count_skipped += 1
            continue
        pdf_url = urljoin(issue_url, pdf_href)

        # Build a unique, non-empty file name for this run
        safe_title = sanitize_filename(title) or "untitled"
        file_stem = safe_title
        duplicate_no = 2
        while file_stem.lower() in used_filenames:
            file_stem = f"{safe_title} ({duplicate_no})"
            duplicate_no += 1
        used_filenames.add(file_stem.lower())
        pdf_path = os.path.join(folder_name, f"{file_stem}.pdf")

        # Download PDF. Network and file-system failures are logged per
        # article so one bad item does not stop the rest of the issue.
        try:
            print(f"[{count_downloaded+1}] Downloading: {safe_title}")
            r = requests.get(pdf_url, headers=headers, timeout=request_timeout)
            r.raise_for_status()
            with open(pdf_path, "wb") as f:
                f.write(r.content)
        except (requests.RequestException, OSError) as e:
            print(f"[ERROR] {title} - {e}")
            csv_writer.writerow([title, pdf_url, f"Error: {e}"])
            count_skipped += 1
        else:
            csv_writer.writerow([title, pdf_url, "OK"])
            count_downloaded += 1

print(f"\n✅ Done! {count_downloaded} PDFs saved in '{folder_name}'")
print(f"⚠️ Skipped: {count_skipped} articles")
print(f"📄 Log file created: {log_path}")
